<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta http-equiv="content-type" content="text/html; charset=utf-8">
    
<meta charset="UTF-8">
<title>Pattern capture token filter | Elasticsearch Guide [7.7] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch Guide [7.7]">
<link rel="up" href="analysis-tokenfilters.html" title="Token filter reference">
<link rel="prev" href="analysis-normalization-tokenfilter.html" title="Normalization token filters">
<link rel="next" href="analysis-pattern_replace-tokenfilter.html" title="Pattern replace token filter">
<meta name="DC.type" content="Learn/Docs/Elasticsearch/Reference/7.7">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="7.7">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <script>dataLayer = [];</script><noscript><iframe src="//www.googletagmanager.com/ns.html?id=GTM-58RLH5" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0], j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src= '//www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f); })(window,document,'script','dataLayer','GTM-58RLH5');</script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      window.dataLayer = window.dataLayer || [];
      function gtag(){dataLayer.push(arguments);}
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      (function(){var g=function(e,h,f,g){
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<strong>IMPORTANT</strong>: No additional bug fixes or documentation updates
will be released for this version. For the latest information, see the
<a href="../current/index.html">current release documentation</a>.
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch Guide [7.7]</a></span>
»
<span class="breadcrumb-link"><a href="analysis.html">Text analysis</a></span>
»
<span class="breadcrumb-link"><a href="analysis-tokenfilters.html">Token filter reference</a></span>
»
<span class="breadcrumb-node">Pattern capture token filter</span>
</div>
<div class="navheader">
<span class="prev">
<a href="analysis-normalization-tokenfilter.html">« Normalization token filters</a>
</span>
<span class="next">
<a href="analysis-pattern_replace-tokenfilter.html">Pattern replace token filter »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="analysis-pattern-capture-tokenfilter"></a>Pattern capture token filter<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch/edit/7.7/docs/reference/analysis/tokenfilters/pattern-capture-tokenfilter.asciidoc">edit</a>
</h2>
</div></div></div>

<p>The <code class="literal">pattern_capture</code> token filter, unlike the <code class="literal">pattern</code> tokenizer,
emits a token for every capture group in the regular expression.
Patterns are not anchored to the beginning and end of the string, so
each pattern can match multiple times, and matches are allowed to
overlap.</p>
<div class="warning admon">
<div class="icon"></div>
<div class="admon_content">
<h3>Beware of Pathological Regular Expressions</h3>
<p>The pattern capture token filter uses
<a href="http://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" class="ulink" target="_top">Java Regular Expressions</a>.</p>
<p>A badly written regular expression could run very slowly or even throw a
StackOverflowError and cause the node it is running on to exit suddenly.</p>
<p>Read more about <a href="http://www.regular-expressions.info/catastrophic.html" class="ulink" target="_top">pathological regular expressions and how to avoid them</a>.</p>
</div>
</div>
<p>For instance a pattern like :</p>
<div class="pre_wrapper lang-text">
<pre class="programlisting prettyprint lang-text">"(([a-z]+)(\d*))"</pre>
</div>
<p>when matched against:</p>
<div class="pre_wrapper lang-text">
<pre class="programlisting prettyprint lang-text">"abc123def456"</pre>
</div>
<p>would produce the tokens: [ <code class="literal">abc123</code>, <code class="literal">abc</code>, <code class="literal">123</code>, <code class="literal">def456</code>, <code class="literal">def</code>,
<code class="literal">456</code> ]</p>
<p>If <code class="literal">preserve_original</code> is set to <code class="literal">true</code> (the default) then it would also
emit the original token: <code class="literal">abc123def456</code>.</p>
<p>This is particularly useful for indexing text like camel-case code, eg
<code class="literal">stripHTML</code> where a user may search for <code class="literal">"strip html"</code> or <code class="literal">"striphtml"</code>:</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT test
{
   "settings" : {
      "analysis" : {
         "filter" : {
            "code" : {
               "type" : "pattern_capture",
               "preserve_original" : true,
               "patterns" : [
                  "(\\p{Ll}+|\\p{Lu}\\p{Ll}+|\\p{Lu}+)",
                  "(\\d+)"
               ]
            }
         },
         "analyzer" : {
            "code" : {
               "tokenizer" : "pattern",
               "filter" : [ "code", "lowercase" ]
            }
         }
      }
   }
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/965.console"></div>
<p>When used to analyze the text</p>
<div class="pre_wrapper lang-java">
<pre class="programlisting prettyprint lang-java">import static org.apache.commons.lang.StringEscapeUtils.escapeHtml</pre>
</div>
<p>this emits the tokens: [ <code class="literal">import</code>, <code class="literal">static</code>, <code class="literal">org</code>, <code class="literal">apache</code>, <code class="literal">commons</code>,
<code class="literal">lang</code>, <code class="literal">stringescapeutils</code>, <code class="literal">string</code>, <code class="literal">escape</code>, <code class="literal">utils</code>, <code class="literal">escapehtml</code>,
<code class="literal">escape</code>, <code class="literal">html</code> ]</p>
<p>Another example is analyzing email addresses:</p>
<div class="pre_wrapper lang-console">
<pre class="programlisting prettyprint lang-console">PUT test
{
   "settings" : {
      "analysis" : {
         "filter" : {
            "email" : {
               "type" : "pattern_capture",
               "preserve_original" : true,
               "patterns" : [
                  "([^@]+)",
                  "(\\p{L}+)",
                  "(\\d+)",
                  "@(.+)"
               ]
            }
         },
         "analyzer" : {
            "email" : {
               "tokenizer" : "uax_url_email",
               "filter" : [ "email", "lowercase",  "unique" ]
            }
         }
      }
   }
}</pre>
</div>
<div class="console_widget" data-snippet="snippets/966.console"></div>
<p>When the above analyzer is used on an email address like:</p>
<div class="pre_wrapper lang-text">
<pre class="programlisting prettyprint lang-text">john-smith_123@foo-bar.com</pre>
</div>
<p>it would produce the following tokens:</p>
<pre class="literallayout">john-smith_123@foo-bar.com, john-smith_123,
john, smith, 123, foo-bar.com, foo, bar, com</pre>

<p>Multiple patterns are required to allow overlapping captures, but also
means that patterns are less dense and easier to understand.</p>
<p><span class="strong strong"><strong>Note:</strong></span> All tokens are emitted in the same position, and with the same
character offsets. This means, for example, that a <code class="literal">match</code> query for
<code class="literal">john-smith_123@foo-bar.com</code> that uses this analyzer will return documents
containing any of these tokens, even when using the <code class="literal">and</code> operator.
Also, when combined with highlighting, the whole original token will
be highlighted, not just the matching subset. For instance, querying
the above email address for <code class="literal">"smith"</code> would highlight:</p>
<div class="pre_wrapper lang-html">
<pre class="programlisting prettyprint lang-html">  &lt;em&gt;john-smith_123@foo-bar.com&lt;/em&gt;</pre>
</div>
<p>not:</p>
<div class="pre_wrapper lang-html">
<pre class="programlisting prettyprint lang-html">  john-&lt;em&gt;smith&lt;/em&gt;_123@foo-bar.com</pre>
</div>
</div>
<div class="navfooter">
<span class="prev">
<a href="analysis-normalization-tokenfilter.html">« Normalization token filters</a>
</span>
<span class="next">
<a href="analysis-pattern_replace-tokenfilter.html">Pattern replace token filter »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  window.initial_state = {}</script>
  </body>
</html>
